In [5]:
# Import packages (same as Workbook 3)
import glob
import random
from typing import List
from collections import defaultdict
import numpy as np
from numpy.random import choice
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from symusic import Score
from miditok import REMI, TokenizerConfig
In [ ]:
random.seed(42)
Symbolic, unconditioned generation¶
In [ ]:
# Parse data
file_path = './maestro-v3.0.0/'
# Train on a subset of years (2004, 2006, 2008, 2009, 2011)
train_files = glob.glob(file_path + '2004/*.midi')
train_files += glob.glob(file_path + '2006/*.midi')
train_files += glob.glob(file_path + '2008/*.midi')
train_files += glob.glob(file_path + '2009/*.midi')  # adding the remaining years makes training take ~2 min/epoch (very slow!)
train_files += glob.glob(file_path + '2011/*.midi')
# Test on a subset of years (2013, 2014, 2015, 2017, 2018)
test_files = glob.glob(file_path + '2013/*.midi')
test_files += glob.glob(file_path + '2014/*.midi')
test_files += glob.glob(file_path + '2015/*.midi')
test_files += glob.glob(file_path + '2017/*.midi')
test_files += glob.glob(file_path + '2018/*.midi')
print(len(train_files))
print(len(test_files))
In [ ]:
# Build Markov chain from training data
from collections import Counter

# Initialize REMI tokenizer
tokenizer = REMI()

# Get all training MIDI file paths from MAESTRO v3.0.0 (years 2004-2011)
train_files = glob.glob('./maestro-v3.0.0/**/*.midi', recursive=True)
train_files = [f for f in train_files if any(str(y) in f for y in range(2004, 2012))]

# Initialize transition counter: token -> next_token -> count
transitions = defaultdict(Counter)

# Iterate over MIDI files and count token transitions
for file in train_files:
    encodings = tokenizer.encode(file)  # returns a list of TokSequence objects (one per track)
    for encoding in encodings:
        ids = encoding.ids
        for i in range(len(ids) - 1):
            transitions[ids[i]][ids[i + 1]] += 1

# Normalize transition counts into probabilities
markov_model = {
    k: {kk: vv / sum(v.values()) for kk, vv in v.items()}
    for k, v in transitions.items()
}
print(f"Built Markov model with {len(markov_model)} unique starting tokens.")
In [9]:
# Define function to generate a sequence using the Markov chain model
def generate_markov_sequence(start_token, length=200):
    sequence = [start_token]
    for _ in range(length - 1):
        curr = sequence[-1]
        next_tokens = markov_model.get(curr, {})
        if not next_tokens:
            break
        next_token = random.choices(
            list(next_tokens.keys()), weights=list(next_tokens.values()), k=1
        )[0]
        sequence.append(next_token)
    return sequence
In [18]:
from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth for MIDI-to-audio conversion
In [ ]:
from miditok import TokSequence

# Generate a sequence using the Markov chain
start = random.choice(list(markov_model.keys()))
generated_ids = generate_markov_sequence(start)

# Wrap in a TokSequence and decode to a symusic Score
seq = TokSequence(ids=generated_ids)
try:
    score = tokenizer.decode([seq])  # returns a symusic Score
    score.dump_midi("symbolic_unconditioned.mid")  # save directly to MIDI
    fs.midi_to_audio("symbolic_unconditioned.mid", "symbolic_unconditioned.wav")
    print("Saved symbolic_unconditioned.mid ✅")
except KeyError as e:
    print("❌ Decode failed due to invalid token ID:", e)
FluidSynth runtime version 2.4.6
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.
Rendering audio to file 'symbolic_unconditioned.wav'..
Saved symbolic_unconditioned.mid ✅
In [22]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using default parameters (constants.py)

train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)

collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)
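Before training, it is worth peeking at one collated batch to confirm shapes and padding (a minimal sketch; the 'input_ids' key matches what the training loop below reads from the collator):
In [ ]:
# Inspect one collated batch: shape is (batch_size, padded_seq_length)
batch = next(iter(train_loader))
print(batch['input_ids'].shape)
print("pad token id:", tokenizer.pad_token_id)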
Define RNN model (LSTM)¶
In [24]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)              # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.fc(out)                 # (batch_size, seq_length, vocab_size)
        return out, hidden
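A quick forward-pass shape check on random token IDs (a minimal sketch with arbitrary small dimensions, independent of the real tokenizer) confirms the (batch, seq, vocab) output contract:
In [ ]:
# Shape check with an untrained model and a dummy vocabulary
_vocab = 300
_m = MusicRNN(vocab_size=_vocab, embedding_dim=32, hidden_dim=64, num_layers=1)
_x = torch.randint(0, _vocab, (2, 16))  # (batch_size=2, seq_length=16)
_logits, _ = _m(_x)
print(_logits.shape)  # expected: torch.Size([2, 16, 300])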
Define training function¶
In [25]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=10, lr=0.001, device='mps'):  # change device depending on system
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)
            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)
                inputs = batch[:, :-1]
                targets = batch[:, 1:]
                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)
                loss = criterion(outputs, targets)
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
    train(model, train_loader, test_loader, vocab_size)
Epoch 1/10 | Train Loss: 3.1702 | Val Loss: 2.6880
Epoch 2/10 | Train Loss: 2.5999 | Val Loss: 2.5081
Epoch 3/10 | Train Loss: 2.4480 | Val Loss: 2.4078
Epoch 4/10 | Train Loss: 2.3212 | Val Loss: 2.3390
Epoch 5/10 | Train Loss: 2.2051 | Val Loss: 2.3059
Epoch 6/10 | Train Loss: 2.0832 | Val Loss: 2.2970
Epoch 7/10 | Train Loss: 1.9584 | Val Loss: 2.3091
Epoch 8/10 | Train Loss: 1.8204 | Val Loss: 2.3500
Epoch 9/10 | Train Loss: 1.6794 | Val Loss: 2.4106
Epoch 10/10 | Train Loss: 1.5352 | Val Loss: 2.4966
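The validation loss bottoms out around epoch 6 and then climbs while the training loss keeps falling, a classic sign of overfitting. Saving the weights now (a minimal sketch; the filename is hypothetical) lets the sampler below be re-run later without retraining:
In [ ]:
# Persist the trained weights (hypothetical filename)
torch.save(model.state_dict(), "music_rnn.pt")
# model.load_state_dict(torch.load("music_rnn.pt"))  # to restore later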
Define sampling function¶
In [27]:
def sample(model, start_token, max_length=100, temperature=1.0, device='mps'):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)
    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]                    # take the last output
        output = output / temperature                # adjust randomness
        probs = F.softmax(output, dim=-1)            # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0:  # stop at EOS (id 2) or PAD (id 0)
            break
        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]  # BOS token id
generated_sequence = sample(model, start_token, max_length=1024)
print("Generated token sequence:")
print(generated_sequence)
Generated token sequence: [1, 4, 206, 49, 101, 130, 208, 50, 101, 125, 215, 47, 99, 125, 219, 50, 97, 131, 54, 103, 127, 4, 192, 57, 104, 145, 205, 49, 101, 132, 206, 45, 101, 127, 213, 52, 101, 133, 215, 49, 100, 133, 40, 99, 132, 220, 45, 101, 133, 4, 189, 42, 97, 132, 198, 45, 99, 136, 203, 42, 99, 126, 208, 54, 103, 129, 212, 50, 96, 126, 215, 47, 98, 128, 220, 54, 105, 146, 4, 192, 46, 99, 129, 196, 50, 101, 130, 201, 54, 104, 134, 206, 53, 105, 133, 209, 49, 100, 127, 213, 55, 102, 129, 214, 50, 99, 126, 217, 54, 104, 127, 219, 50, 101, 126, 4, 189, 47, 99, 128, 54, 102, 127, 191, 38, 98, 128, 193, 50, 99, 128, 198, 49, 104, 129, 200, 45, 98, 128, 42, 98, 128, 205, 54, 103, 134, 50, 102, 133, 206, 21, 101, 128, 213, 21, 100, 127, 217, 54, 104, 127, 219, 105, 102, 128, 4, 191, 53, 103, 127, 47, 100, 125, 194, 54, 102, 126, 197, 35, 101, 135, 42, 99, 128, 200, 56, 95, 127, 204, 51, 99, 126, 43, 99, 127, 207, 54, 104, 136, 211, 45, 98, 128, 213, 35, 99, 131, 51, 97, 125, 215, 52, 96, 131, 218, 48, 100, 125, 220, 42, 98, 127, 4, 192, 54, 104, 141, 197, 39, 101, 147, 49, 100, 126, 200, 42, 98, 126, 202, 45, 102, 125, 204, 49, 101, 127, 208, 54, 104, 133, 209, 49, 98, 134, 212, 47, 102, 126, 215, 38, 101, 125, 35, 99, 133, 218, 54, 107, 131, 220, 50, 103, 125, 4, 191, 35, 101, 137, 55, 106, 131, 194, 52, 105, 127, 196, 55, 110, 127, 198, 52, 107, 127, 201, 46, 104, 129, 35, 101, 132, 205, 54, 109, 125, 42, 102, 126, 207, 48, 104, 125, 208, 52, 110, 126, 48, 102, 125, 213, 57, 111, 126, 35, 102, 129, 44, 97, 127, 214, 53, 102, 125, 215, 37, 99, 127, 4, 190, 59, 107, 127, 40, 102, 125, 191, 50, 104, 125, 47, 103, 125, 193, 61, 109, 127, 195, 49, 103, 127, 197, 61, 108, 127, 198, 59, 98, 127, 200, 46, 100, 126, 203, 37, 100, 129, 205, 61, 108, 127, 206, 52, 101, 125, 208, 49, 100, 126, 211, 62, 108, 127, 35, 102, 128, 49, 101, 125, 212, 40, 103, 125, 215, 47, 105, 126, 45, 105, 126, 217, 59, 109, 127, 218, 52, 105, 126, 220, 54, 107, 126, 44, 104, 126, 4, 189, 55, 108, 132, 190, 45, 99, 126, 191, 47, 105, 127, 192, 59, 107, 125, 47, 106, 125, 194, 57, 108, 127, 52, 105, 125, 195, 43, 106, 125, 197, 49, 105, 126, 57, 108, 126, 198, 61, 108, 129, 49, 102, 125, 199, 52, 103, 125, 200, 59, 106, 126, 201, 61, 95, 125, 202, 38, 102, 126, 204, 45, 108, 129, 49, 109, 127, 40, 104, 131, 206, 59, 108, 126, 208, 58, 109, 125, 37, 109, 130, 210, 57, 109, 125, 211, 45, 108, 129, 61, 111, 126, 213, 59, 111, 126, 214, 56, 114, 125, 216, 59, 113, 127, 217, 47, 116, 126, 218, 61, 114, 127, 220, 49, 111, 128, 4, 189, 61, 112, 134, 190, 38, 104, 127, 193, 41, 102, 125, 196, 49, 110, 127, 198, 50, 109, 126, 199, 28, 101, 130, 200, 62, 108, 127, 50, 102, 127, 47, 102, 127, 202, 61, 108, 126, 203, 45, 102, 134, 204, 69, 111, 126, 57, 105, 125, 205, 38, 102, 125, 207, 33, 105, 125, 208, 60, 110, 126, 209, 45, 105, 125, 210, 62, 108, 126, 211, 26, 101, 129, 213, 59, 109, 127, 215, 57, 107, 126, 216, 59, 108, 126, 218, 52, 103, 125, 48, 99, 125, 220, 59, 106, 133, 4, 189, 45, 100, 127, 35, 100, 128, 42, 100, 134, 190, 42, 97, 126, 193, 56, 106, 134, 201, 33, 99, 125, 202, 35, 93, 128, 59, 107, 134, 208, 57, 108, 131, 209, 47, 99, 127, 211, 59, 111, 144, 215, 35, 100, 127, 219, 42, 101, 126, 4, 193, 47, 99, 126, 194, 63, 112, 138, 198, 35, 100, 128, 47, 99, 127, 202, 44, 95, 125, 203, 47, 93, 125, 205, 40, 102, 125, 207, 45, 98, 127, 209, 47, 102, 125, 212, 52, 105, 126, 213, 59, 109, 128, 40, 101, 127, 219, 37, 99, 128, 65, 107, 133, 220, 53, 100, 131, 4, 191, 47, 99, 127, 35, 100, 127, 196, 40, 100, 125, 40, 102, 
126, 199, 49, 103, 125, 40, 102, 125, 204, 59, 110, 135, 207, 35, 100, 128, 40, 102, 127, 210, 40, 101, 125, 212, 47, 100, 125, 216, 49, 105, 125, 218, 47, 99, 125, 59, 106, 132, 4, 189, 35, 100, 126, 193, 42, 101, 126, 54, 105, 129, 38, 99, 125, 194, 47, 99, 125, 197, 42, 100, 125, 35, 100, 125, 199, 59, 107, 128, 201, 35, 101, 128, 47, 101, 126, 204, 40, 102, 125, 54, 108, 129, 205, 47, 101, 126, 210, 35, 102, 125, 59, 109, 127, 211, 47, 103, 125, 214, 42, 102, 125, 215, 47, 103, 125, 218, 30, 98, 125, 219, 46, 103, 125, 47, 100, 125, 4, 191, 33, 102, 125, 69, 110, 141, 192, 40, 103, 126, 49, 102, 125, 42, 98, 125, 195, 49, 104, 125, 198, 40, 103, 126, 53, 107, 125, 201, 49, 105, 125, 202, 35, 100, 125, 45, 99, 125, 203, 61, 109, 131, 49, 100, 125, 206, 47, 103, 125, 207, 42, 102, 125, 45, 98, 125, 210, 50, 108, 127, 47, 103, 125, 212, 47, 106, 125, 213, 50, 107, 125, 214, 47, 103, 126, 215, 47, 105, 125, 216, 50, 104, 127, 47, 107, 125, 217, 50, 105, 125, 219, 54, 111, 130, 47, 105, 125, 220, 50, 108, 130, 4, 191, 47, 105, 125, 192, 50, 107, 125, 193, 54, 108, 127, 47, 107, 125, 195, 42, 105, 126, 197, 47, 107, 125, 54, 107, 125, 198, 51, 108, 126, 199, 46, 106, 126, 200, 50, 108, 126, 201, 47, 108, 125, 202, 32, 108, 128]
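Temperature divides the logits before the softmax, so values below 1.0 sharpen the distribution toward the model's top predictions while values above 1.0 flatten it toward uniform. A quick comparison (a sketch reusing the sample function above; not run here):
In [ ]:
# Compare token variety at different temperatures (sketch)
for t in (0.7, 1.0, 1.3):
    seq = sample(model, start_token, max_length=64, temperature=t)
    print(f"temperature={t}: {len(set(seq))} unique tokens out of {len(seq)}")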
In [ ]:
output_score = tokenizer.decode([generated_sequence])  # `tokens_to_midi` was renamed `decode` in miditok
output_score.dump_midi("rnn.mid")
fs.midi_to_audio("rnn.mid", "rnn.wav")
display(Audio("rnn.wav"))
FluidSynth runtime version 2.4.6
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.
Rendering audio to file 'rnn.wav'..
Symbolic, conditioned generation (harmonization)¶
In [30]:
# Define simple harmonization model with LSTM
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class to prepare data for melody -> chord harmonization
class HarmonizationDataset(Dataset):
    def __init__(self, file_paths, tokenizer):
        self.data = []
        # Iterate through MIDI files (limit to 100 for speed)
        for path in file_paths[:100]:
            try:
                encodings = tokenizer.encode(path)
                ids = []
                for encoding in encodings:
                    ids.extend(encoding.ids)  # flatten all token segments into a single sequence
                # Slide over the tokenized MIDI to generate melody-chord pairs
                for i in range(len(ids) - 16):
                    melody = ids[i:i+8]     # first 8 tokens are treated as the melody
                    chords = ids[i+8:i+16]  # next 8 tokens are treated as the chords
                    self.data.append((melody, chords))
            except Exception as e:
                print(f"Skipping file {path} due to error: {e}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        melody, chords = self.data[idx]
        return torch.tensor(melody), torch.tensor(chords)

# Define the LSTM-based harmonization model
class HarmonizationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        # Embedding layer transforms token IDs to dense vectors
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # LSTM captures sequential patterns in melody and chord progression
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        # Linear layer maps LSTM output to chord token probabilities
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        output, _ = self.lstm(x)
        logits = self.fc(output)
        return logits
In [31]:
# Create dataset and loader using train_files
dataset = HarmonizationDataset(train_files, tokenizer)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize model, loss, and optimizer
model = HarmonizationModel(tokenizer.vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# Simple training loop
num_epochs = 7
for epoch in range(num_epochs):
    model.train()
    train_losses = []
    for melody, chords in loader:
        preds = model(melody)
        loss = loss_fn(preds.view(-1, preds.shape[-1]), chords.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    avg_train_loss = sum(train_losses) / len(train_losses)
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f}")
Epoch 1/7 | Train Loss: 3.7491
Epoch 2/7 | Train Loss: 3.6946
Epoch 3/7 | Train Loss: 3.6709
Epoch 4/7 | Train Loss: 3.6546
Epoch 5/7 | Train Loss: 3.6428
Epoch 6/7 | Train Loss: 3.6340
Epoch 7/7 | Train Loss: 3.6272
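To see exactly what the dataset pairs up (a quick check on the first example; note both halves are raw REMI token IDs from a sliding window, not separated melody and chord tracks):
In [ ]:
# Inspect one (melody, chords) training pair
melody_ids, chord_ids = dataset[0]
print("input window :", melody_ids.tolist())
print("target window:", chord_ids.tolist())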
In [32]:
# Select a melody sample
melody, _ = dataset[0]

# Use the trained model to predict chords
model.eval()
with torch.no_grad():
    preds = model(melody.unsqueeze(0))
    pred_ids = preds.argmax(dim=-1).squeeze().tolist()  # get predicted chord token IDs
In [33]:
from miditok import TokSequence

# Combine melody and predicted chords into one list of IDs
combined = melody.tolist() + pred_ids

# Wrap in a TokSequence
seq = TokSequence(ids=combined)

# Decode into a symusic Score
score = tokenizer.decode([seq])

# Dump the Score to a .mid file
score.dump_midi("symbolic_conditioned.mid")
fs.midi_to_audio("symbolic_conditioned.mid", "symbolic_conditioned.wav")
print("✅ Saved symbolic_conditioned.mid")
FluidSynth runtime version 2.4.6
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.
Rendering audio to file 'symbolic_conditioned.wav'..
✅ Saved symbolic_conditioned.mid
In [37]:
display(Audio("symbolic_conditioned.wav"))
In [35]:
display(Audio("symbolic_unconditioned.wav"))
In [36]:
display(Audio("rnn.wav"))